{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2d124d22-de73-436b-86cd-9b162b469be8",
   "metadata": {
    "id": "2d124d22-de73-436b-86cd-9b162b469be8"
   },
   "outputs": [],
   "source": [
    "%pip install --upgrade pip\n",
    "\n",
    "# Uninstall conflicting packages\n",
    "%pip uninstall -y langchain-core langchain-openai langchain-experimental langchain-community langchain chromadb beautifulsoup4 python-dotenv PyPDF2 rank_bm25\n",
    "\n",
    "# Install compatible versions of langchain libraries\n",
    "%pip install langchain-core==0.3.6\n",
    "%pip install langchain-openai==0.2.1\n",
    "%pip install langchain-experimental==0.3.2\n",
    "%pip install langchain-community==0.3.1\n",
    "%pip install langchain==0.3.1\n",
    "\n",
    "# Install remaining packages\n",
    "%pip install chromadb==0.5.11\n",
    "%pip install beautifulsoup4==4.12.3\n",
    "%pip install python-dotenv==1.0.1\n",
    "%pip install PyPDF2==3.0.1 -q --user\n",
    "%pip install rank_bm25==0.2.2\n",
    "\n",
    "# Restart the kernel after installation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "f884314f-870c-4bfb-b6c1-a5b4801ec172",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 4690,
     "status": "ok",
     "timestamp": 1716948148440,
     "user": {
      "displayName": "",
      "userId": ""
     },
     "user_tz": 240
    },
    "id": "f884314f-870c-4bfb-b6c1-a5b4801ec172",
    "outputId": "76ea9fdd-5ba5-48f7-be31-351b68a76355"
   },
   "outputs": [],
   "source": [
    "import os\n",
    "os.environ['USER_AGENT'] = 'RAGUserAgent'\n",
    "import openai\n",
    "from langchain_openai import ChatOpenAI, OpenAIEmbeddings\n",
    "from langchain import hub\n",
    "from langchain_core.output_parsers import StrOutputParser\n",
    "from langchain_core.runnables import RunnablePassthrough\n",
    "import chromadb\n",
    "from langchain_community.vectorstores import Chroma\n",
    "from langchain_core.runnables import RunnableParallel\n",
    "from dotenv import load_dotenv, find_dotenv\n",
    "from langchain_core.prompts import PromptTemplate\n",
    "from PyPDF2 import PdfReader\n",
    "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
    "from langchain_core.documents.base import Document\n",
    "from langchain_community.retrievers import BM25Retriever\n",
    "from langchain.retrievers import EnsembleRetriever"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "eba3468a-d7c2-4a79-8df2-c335542950f2",
   "metadata": {
    "executionInfo": {
     "elapsed": 278,
     "status": "ok",
     "timestamp": 1716948177675,
     "user": {
      "displayName": "",
      "userId": ""
     },
     "user_tz": 240
    },
    "id": "eba3468a-d7c2-4a79-8df2-c335542950f2"
   },
   "outputs": [],
   "source": [
    "# variables\n",
    "_ = load_dotenv(dotenv_path='env.txt')\n",
    "os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')\n",
    "openai.api_key = os.environ['OPENAI_API_KEY']\n",
    "embedding_function = OpenAIEmbeddings()\n",
    "llm = ChatOpenAI(model_name=\"gpt-4o\")\n",
    "pdf_path = \"google-2023-environmental-report.pdf\"\n",
    "collection_name = \"google_environmental_report\"\n",
    "str_output_parser = StrOutputParser()\n",
    "user_query = \"What are Google's environmental initiatives?\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "d3ad428a-3eb6-40ec-a1a5-62565ead1e5b",
   "metadata": {
    "id": "d3ad428a-3eb6-40ec-a1a5-62565ead1e5b"
   },
   "outputs": [],
   "source": [
    "#### INDEXING ####"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "98ccda2c-0f4c-41c5-804d-2227cdf35aa7",
   "metadata": {
    "executionInfo": {
     "elapsed": 10611,
     "status": "ok",
     "timestamp": 1716948215859,
     "user": {
      "displayName": "",
      "userId": ""
     },
     "user_tz": 240
    },
    "id": "98ccda2c-0f4c-41c5-804d-2227cdf35aa7"
   },
   "outputs": [],
   "source": [
    "# PDF Loader\n",
    "docs = []\n",
    "with open(pdf_path, \"rb\") as pdf_file:\n",
    "    pdf_reader = PdfReader(pdf_file)\n",
    "    pdf_text = \"\".join(page.extract_text() for page in pdf_reader.pages)\n",
    "    docs = [Document(page_content=page) for page in pdf_text.split(\"\\n\\n\")]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "855b6438-6c51-4b25-b799-36fb7b592bf5",
   "metadata": {
    "id": "855b6438-6c51-4b25-b799-36fb7b592bf5"
   },
   "outputs": [],
   "source": [
    "# RecursiveCharacterTextSplitter\n",
    "from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
    "\n",
    "recursive_splitter = RecursiveCharacterTextSplitter(\n",
    "    separators=[\"\\n\\n\", \"\\n\", \". \", \" \", \"\"],\n",
    "    chunk_size=1000,\n",
    "    chunk_overlap=200\n",
    ")\n",
    "\n",
    "splits = recursive_splitter.split_documents(docs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ec364c73-f784-4034-83d4-38d5eae62279",
   "metadata": {},
   "outputs": [],
   "source": [
    "dense_documents = [Document(page_content=doc.page_content, metadata={\"id\": str(i), \"search_source\": \"dense\"}) for i, doc in enumerate(splits)]\n",
    "sparse_documents = [Document(page_content=doc.page_content, metadata={\"id\": str(i), \"search_source\": \"sparse\"}) for i, doc in enumerate(splits)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "6b13568c-d633-464d-8c43-0d55f34cc8c1",
   "metadata": {
    "executionInfo": {
     "elapsed": 4507,
     "status": "ok",
     "timestamp": 1716948724972,
     "user": {
      "displayName": "",
      "userId": ""
     },
     "user_tz": 240
    },
    "id": "6b13568c-d633-464d-8c43-0d55f34cc8c1"
   },
   "outputs": [],
   "source": [
    "# Chroma Vector Store\n",
    "chroma_client = chromadb.Client()\n",
    "vectorstore = Chroma.from_documents(\n",
    "    documents=dense_documents,\n",
    "    embedding=embedding_function,\n",
    "    collection_name=collection_name,\n",
    "    client=chroma_client\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "749fcce7-3203-49e8-a62f-ecf2edce4570",
   "metadata": {
    "executionInfo": {
     "elapsed": 128,
     "status": "ok",
     "timestamp": 1716948726221,
     "user": {
      "displayName": "",
      "userId": ""
     },
     "user_tz": 240
    },
    "id": "749fcce7-3203-49e8-a62f-ecf2edce4570"
   },
   "outputs": [],
   "source": [
    "dense_retriever = vectorstore.as_retriever(search_kwargs={\"k\": 10})\n",
    "sparse_retriever = BM25Retriever.from_documents(sparse_documents, k=10)\n",
    "ensemble_retriever = EnsembleRetriever(retrievers=[dense_retriever, sparse_retriever], weights=[0.5, 0.5], c=0, k=10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "6ce8df01-925b-45b5-8fb8-17b5c40c581f",
   "metadata": {
    "id": "6ce8df01-925b-45b5-8fb8-17b5c40c581f"
   },
   "outputs": [],
   "source": [
    "#### RETRIEVAL and GENERATION ####"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "fac053d8-b871-4b50-b04e-28dec9fb3b0f",
   "metadata": {
    "executionInfo": {
     "elapsed": 262,
     "status": "ok",
     "timestamp": 1716948727550,
     "user": {
      "displayName": "",
      "userId": ""
     },
     "user_tz": 240
    },
    "id": "fac053d8-b871-4b50-b04e-28dec9fb3b0f"
   },
   "outputs": [],
   "source": [
    "# Prompt - ignore LangSmith warning, you will not need langsmith for this coding exercise\n",
    "prompt = hub.pull(\"jclemens24/rag-prompt\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "5ef30632-13dd-4a34-af33-cb8fab94f169",
   "metadata": {
    "executionInfo": {
     "elapsed": 116,
     "status": "ok",
     "timestamp": 1716948728500,
     "user": {
      "displayName": "",
      "userId": ""
     },
     "user_tz": 240
    },
    "id": "5ef30632-13dd-4a34-af33-cb8fab94f169"
   },
   "outputs": [],
   "source": [
    "# Relevance check prompt\n",
    "relevance_prompt_template = PromptTemplate.from_template(\n",
    "    \"\"\"\n",
    "    Given the following question and retrieved context, determine if the context is relevant to the question.\n",
    "    Provide a score from 1 to 5, where 1 is not at all relevant and 5 is highly relevant.\n",
    "    Return ONLY the numeric score, without any additional text or explanation.\n",
    "\n",
    "    Question: {question}\n",
    "    Retrieved Context: {retrieved_context}\n",
    "\n",
    "    Relevance Score:\"\"\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "d549002b-6d57-4e2d-b212-0ddb9d306690",
   "metadata": {},
   "outputs": [],
   "source": [
    "#### OUTPUT PARSERS ####"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "e1f8d5fa-b799-4b1b-9cb2-151fba50b225",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain_core.output_parsers import StrOutputParser\n",
    "str_output_parser = StrOutputParser()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "1e274a59-652e-4fd3-9064-b56561651966",
   "metadata": {},
   "outputs": [],
   "source": [
    "# JSON Output Parser\n",
    "from langchain_core.output_parsers import JsonOutputParser\n",
    "from langchain_core.pydantic_v1 import BaseModel, Field\n",
    "from langchain_core.outputs import Generation\n",
    "import json\n",
    "\n",
    "# Define FinalOutputModel for JSON output\n",
    "class FinalOutputModel(BaseModel):\n",
    "    relevance_score: float = Field(description=\"The relevance score of the retrieved context to the question\")\n",
    "    answer: str = Field(description=\"The final answer to the question\")\n",
    "\n",
    "json_parser = JsonOutputParser(pydantic_model=FinalOutputModel)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "e8975479-b3e3-481d-ad7b-08b4eb3faaef",
   "metadata": {
    "executionInfo": {
     "elapsed": 119,
     "status": "ok",
     "timestamp": 1716948730384,
     "user": {
      "displayName": "",
      "userId": ""
     },
     "user_tz": 240
    },
    "id": "e8975479-b3e3-481d-ad7b-08b4eb3faaef"
   },
   "outputs": [],
   "source": [
    "# Post-processing\n",
    "def format_docs(docs):\n",
    "    return \"\\n\\n\".join(doc.page_content for doc in docs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "d6d7991f-24d0-47ea-b7ff-3ab2b8e12816",
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_score(llm_output):\n",
    "    try:\n",
    "        score = float(llm_output.strip())\n",
    "        return score\n",
    "    except ValueError:\n",
    "        return 0\n",
    "\n",
    "def format_json_output(x):\n",
    "    json_output = {\n",
    "        \"relevance_score\": extract_score(x['relevance_score']),\n",
    "        \"answer\": x['answer'],\n",
    "    }\n",
    "    return json_parser.parse_result([Generation(text=json.dumps(json_output))])\n",
    "\n",
    "def conditional_answer(x):\n",
    "    relevance_score = extract_score(x['relevance_score'])\n",
    "    if relevance_score < 4:\n",
    "        return \"I don't know.\"\n",
    "    else:\n",
    "        return format_json_output(x)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "8369060d-cb71-4435-9a0f-4a18b54ff924",
   "metadata": {},
   "outputs": [],
   "source": [
    "rag_chain = (\n",
    "    RunnableParallel({\"context\": ensemble_retriever, \"question\": RunnablePassthrough()})\n",
    "    | RunnablePassthrough.assign(context=(lambda x: format_docs(x[\"context\"])))\n",
    "    | RunnableParallel(\n",
    "        {\n",
    "            \"relevance_score\": (\n",
    "                RunnablePassthrough()\n",
    "                | (\n",
    "                    lambda x: relevance_prompt_template.format(\n",
    "                        question=x[\"question\"], retrieved_context=x[\"context\"]\n",
    "                    )\n",
    "                )\n",
    "                | llm\n",
    "                | str_output_parser\n",
    "            ),\n",
    "            \"answer\": (\n",
    "                RunnablePassthrough()\n",
    "                | prompt\n",
    "                | llm\n",
    "                | str_output_parser\n",
    "            ),\n",
    "        }\n",
    "    )\n",
    "    | RunnablePassthrough().assign(final_result=conditional_answer)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "ac8d4fef-04ca-426c-b742-a367818209ea",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Original Question: What are Google's environmental initiatives?\n",
      "\n",
      "Relevance Score: 5\n",
      "\n",
      "Final Answer:\n",
      "Google's environmental initiatives include empowering individuals to take action through sustainability features in products like Google Maps, Google Nest thermostats, and Google Flights. They aim to help individuals, cities, and partners collectively reduce 1 gigaton of carbon equivalent emissions annually by 2030. Google also works with suppliers to reduce energy consumption and greenhouse gas emissions, as well as engages in public policy advocacy for low-carbon economies. Additionally, Google is involved in initiatives like the iMasons Climate Accord, ReFED, and supporting projects with The Nature Conservancy to address environmental challenges. They also focus on operating sustainably in their own operations, such as promoting sustainable consumption of public goods and engaging with coalitions like the RE-Source Platform. Google is also working on renewable energy solutions and using data analytics tools to drive more intelligent supply chains.\n",
      "\n",
      "\n",
      "Final JSON Output:\n",
      "{'relevance_score': '5', 'answer': \"Google's environmental initiatives include empowering individuals to take action through sustainability features in products like Google Maps, Google Nest thermostats, and Google Flights. They aim to help individuals, cities, and partners collectively reduce 1 gigaton of carbon equivalent emissions annually by 2030. Google also works with suppliers to reduce energy consumption and greenhouse gas emissions, as well as engages in public policy advocacy for low-carbon economies. Additionally, Google is involved in initiatives like the iMasons Climate Accord, ReFED, and supporting projects with The Nature Conservancy to address environmental challenges. They also focus on operating sustainably in their own operations, such as promoting sustainable consumption of public goods and engaging with coalitions like the RE-Source Platform. Google is also working on renewable energy solutions and using data analytics tools to drive more intelligent supply chains.\", 'final_result': {'relevance_score': 5.0, 'answer': \"Google's environmental initiatives include empowering individuals to take action through sustainability features in products like Google Maps, Google Nest thermostats, and Google Flights. They aim to help individuals, cities, and partners collectively reduce 1 gigaton of carbon equivalent emissions annually by 2030. Google also works with suppliers to reduce energy consumption and greenhouse gas emissions, as well as engages in public policy advocacy for low-carbon economies. Additionally, Google is involved in initiatives like the iMasons Climate Accord, ReFED, and supporting projects with The Nature Conservancy to address environmental challenges. They also focus on operating sustainably in their own operations, such as promoting sustainable consumption of public goods and engaging with coalitions like the RE-Source Platform. Google is also working on renewable energy solutions and using data analytics tools to drive more intelligent supply chains.\"}}\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "result = rag_chain.invoke(user_query)\n",
    "\n",
    "print(f\"Original Question: {user_query}\\n\")\n",
    "print(f\"Relevance Score: {result['relevance_score']}\\n\")\n",
    "print(f\"Final Answer:\\n{result['final_result']['answer']}\\n\\n\")\n",
    "print(f\"Final JSON Output:\\n{result}\\n\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f89f4d4f-cb7d-4fcb-8dc5-467a1e6bbe48",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "colab": {
   "name": "CHAPTER11-2_TEXT_SPLITTERS.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
